#!/bin/bash

# SAS Util: Tool to list SAS drives / controllers on the system, and potentially
# measure various SAS drives' response to "set standby_z" command.
#
# Some drives respond "as expected": a temporary spindle spin-down that is
# terminated (the drive spins up) by the next i/o against the drive. Others
# go into a permanent idle state, which requires an explicit "start" command.
# Yet others just ignore the command.
#
# Currently (2020-10-10) it appears this is not just drive dependent, but
# could be controller-dependent or maybe even combo-dependent. This tool
# was built to collect this info.
#
# Unraid environment is assumed.
#
# (c) 2020-2024 @doron - CC BY-SA 4.0

VERSION=20240218.01 # Tool version

# Absolute paths of the external programs this tool invokes
DD=/usr/bin/dd
JQ=/usr/bin/jq
LSPCI=/sbin/lspci
MDCMD=/usr/local/sbin/mdcmd
SDPARM=/usr/sbin/sdparm
SG_START=/usr/bin/sg_start
SMARTCTL=/usr/sbin/smartctl

# Sense text that identifies a spun-down SAS drive
SAS_SPUNDOWN_SENSE_RE='(standby condition activated|notify \(enable spinup\) required)'

# Set to true to get extra progress messages
DEBUG=false

# Identity of this run: process id and script name
MYPID=$$
ME=$(basename $0)

# Unraid state directory, and the version string found between the first
# pair of double quotes in /etc/unraid-version
UNRAIDVAR=/var/local/emhttp
UNRAIDVERSION=$(cat /etc/unraid-version | cut -d '"' -f2)

# Where the JSON report is written
OUTPUTJSON=/tmp/sas-util-out


# Main - program entry point: list the mechanical SAS drives and their
# controllers and, when asked, measure how each drive reacts to spin-down.
#
# Arguments:
#   $1 - "test" (any letter case) runs the spin-down test on every drive found;
#        any other value, or none, only collects drive/controller information.
# Output:
#   Progress messages and a summary table on stdout; two JSON documents
#   (drives first, then controllers) written one after the other to $OUTPUTJSON.
# Exit status:
#   2 (through ScramWith) when no SAS drive is found, when testing is requested
#   while the array is started, or when the user declines the confirmation.
# Note: the variables assigned here are not declared local, so they stay global.
Main () {

Tell "SAS Spindown Utility (v$VERSION)"

if [[ "${1,,}" == "test" ]] ; then
  TESTING=true
else
  TESTING=false
fi

# Put the real smartctl back on every exit path
trap "Cleanup" EXIT

# From here on, [[ ]] string and regex comparisons ignore letter case
shopt -s nocasematch

###############
# Collect all mechanical SAS drives
SASDRIVES=""

# Partition links ("...-partN -> ...") are filtered out, then everything up to
# the last "/" of each remaining line is dropped, leaving the device name.
for DISK in $(ls -la /dev/disk/by-path/* | grep -E -v "part[0-9]+ " | sed 's/.*\///') ; do

  # A device may show up under more than one by-path link; handle it only once
  [[ " $SASDRIVES " == *" $DISK "* ]] && continue

  if IsSAS $DISK && IsRotational $DISK ; then
    SASDRIVES+=" $DISK"
  fi

done

if [ "$SASDRIVES" == "" ] ; then
  ScramWith "No SAS drives detected."
fi

if $TESTING ; then

  ###############
  # We do testing only when the array is stopped
  if IsArrayStarted ; then
    ScramWith "SAS spindown testing must be run with the array stopped."
  fi

  Tell "SAS spin-down testing will be performed on: $SASDRIVES. Please confirm!"
  Yesno

  InstallHook   # Install a blocker to "smartctl"

fi

###############

SUMMARY="" ; JSONDATA="" ; CONTROLLERS=""

for DISK in $SASDRIVES ; do

  MODEL=$(cat /sys/block/$DISK/device/model | Trim)

  # vendor:device:subsystem_vendor:subsystem_device of the controller, read
  # from the sysfs directory that precedes the drive's "hostN" path component
  CTRLID=$(
    # e.g. /sys/devices/pci0000:00/0000:00:17.0/0000:13:00.0
    # Leave the ID empty rather than read unrelated files if the cd fails
    cd "$(readlink -m /sys/block/$DISK | sed 's=/host[0-9a-f].*==')" || exit
    cat vendor device subsystem_vendor subsystem_device |
      paste -sd':' |
      sed 's/0x//g'     # 0x1234 => 1234
    )
  CONTROLLERS+=" $(GetCtrlSlot $DISK)"

  TIME="" ; RESULT=" n/a "

  if $TESTING ; then

    Tellnr "Testing $DISK ($MODEL): "

    ## First, make sure the drive is spun up
    Tellnr "Ensuring drive is spinning..."
    $DD if=/dev/$DISK of=/dev/null bs=4K count=1 &> /dev/null
    sleep 1s

    ## This should spin down the device
    Tellnr "Issuing the spin-down command, and waiting a few seconds..."
    $SG_START --pc=3 --readonly /dev/$DISK
    sleep 3s

    SENSE1=$($SDPARM -C sense /dev/$DISK 2>&1 | Oneliner)

    if IsSBY $DISK ; then
      Tellnr "...spun down. Issuing i/o against drive..."

      echo 1 > /proc/sys/vm/drop_caches   # Make sure Linux not reading from cache

      STIME=$(date +%s%N)
      # Copy some data, randomizing read point to avert controller caching
      $DD if=/dev/$DISK of=/dev/null bs=4K count=1 skip=$(( $RANDOM * 1024 + 1 )) &> /dev/null
      DDRC=$?
      ETIME=$(date +%s%N)

      TIME=$(( ($ETIME - $STIME) / 1000000 ))   # From nanosec to millisec

      if [ $DDRC != 0 ] ; then
        RESULT="failed"
        Tellnr "i/o to drive seems to fail - restarting..."
        $SG_START --pc=1 --readonly /dev/$DISK
      elif [ $TIME -lt 300 ] ; then
        # The read came back in under 300 ms
        RESULT="toofast"
      else
        RESULT="success"
      fi

    elif [[ "$SENSE1" =~ "logical unit not ready" ]] ; then
      Tellnr "Drive seems to have STOPped; restarting..."
      RESULT=stopping
      $SG_START --pc=1 --readonly /dev/$DISK
      SENSE2=$($SDPARM -C sense /dev/$DISK 2>&1 | Oneliner)
      if [[ "$SENSE2" =~ "logical unit not ready" ]] ; then
        Tellnr "Restart with pc=1 does not do it, trying 'start'..."
        $SG_START --start --readonly /dev/$DISK
        SENSE2=$($SDPARM -C sense /dev/$DISK 2>&1 | Oneliner)
        if [[ "$SENSE2" =~ "logical unit not ready" ]] ; then
          Tellnr "Failed to restart device, reboot will be needed."
        else
          Tellnr "Seems to have restarted with 'start'..."
          RESULT+="; restarted2"
        fi
      else
        Tellnr "Seems to have restarted with pc=1..."
        RESULT+="; restarted1"
      fi

    else
      Tellnr "Drive does not seem to spin down..."
      RESULT=ignored

    fi
    Tellnr "...all done.\n\n"
  fi

  SUMMARY+=$(echo -e "\n$DISK\t| $MODEL\t| $CTRLID\t| $RESULT\t| $TIME")

  # One JSON object per drive; a trailing comma separates it from the next one
  JSONDATA+="{ \"drive\": {  \"model\": \"$MODEL\",
    \"sdparm-i\": \"$(echo "$($SDPARM -i /dev/$DISK 2>&1)|RC=$?" | Oneliner)\",
    \"controller-id\": \"$CTRLID\",
    \"controller-slot\": \"$(GetCtrlSlot $DISK)\"  "

  if $TESTING ; then
    JSONDATA+=",
    \"sense\": \"$SENSE1\",
    \"result\": \"$RESULT\",
    \"time\": \"$TIME\"  "
  fi

  JSONDATA+=" } },"

done

JSONDATA=${JSONDATA%,}		# Remove trailing comma


Tell "\n$SUMMARY"

## Tell "\n\nIf you would you like to add any text to the test results, please do so now:"
## read EXTRATEXT
## EXTRATEXT=$(tr "\'\"" ' ' <<< $EXTRATEXT)

# Reduce the collected controller slots to a unique list
CONTROLLERS=$(echo "$CONTROLLERS" | tr ' ' '\n' | sort | uniq)


# First JSON document: run metadata plus the per-drive data (overwrites the file)
$JQ . <<< $(echo "{\"utility-run\": { \"date\": \"$(date +"%Y%m%d-%H:%M %Z")\", \"version\": \"$VERSION\" , \"Unraid version\": \"$UNRAIDVERSION\" , \"message\":\"$EXTRATEXT\", \"drives\": 
	[ $JSONDATA ] } }"
	) > $OUTPUTJSON


# JSON-ize lspci for SAS controllers :-)
# Second JSON document, appended to the same file
$JQ . <<< $(echo "{ \"controllers\": [ 
	$(for C in $CONTROLLERS ; do 
		echo -n "{ \"controller\": { 
			$($LSPCI -vvqnnmms $C | 
			sed -r 's/(.+):\s+(.*)/\"\1\": \"\2\",/' | 
			paste -sd ' ' | 
			sed -e 's/,\s*$//') } } ,"
	done | sed -e 's/,$//') ] }"
	) >> $OUTPUTJSON

Tell "Run completed. The output is at $OUTPUTJSON."

}




########## Functions

# Filter: strip leading and trailing whitespace from each line read on stdin
Trim () {
  sed -e 's/^\s*//' -e 's/\s*$//'
}

# Filter: turn the multi-line text on stdin into a single line.
# Lines are joined with "|", double quotes are removed, one trailing "|" is
# dropped and every run of whitespace is squeezed into a single space.
# The result is written to stdout followed by a newline.
Oneliner () {

    local IFS=$' \t\n'		# Split on plain whitespace, whatever the caller set
    local joined
    local -a words=()

    joined=$(paste -sd '|' |
		sed -e 's/\"//g' -e 's/|$//')	# Remove quotes if any and trailing |

    # Split with "read -a" rather than an unquoted expansion: the text is
    # squeezed the same way, but characters such as * ? [ ] in it are never
    # expanded into file names. read returns non-zero at end of input; the
    # array is filled regardless.
    read -r -d '' -a words <<< "$joined"

    # printf, unlike echo, does not treat a leading -n / -e as an option
    printf '%s\n' "${words[*]}"

}

# Report a fatal error (all arguments joined into one message) and
# terminate the script with exit status 2
ScramWith () {
  local reason="$*"

  Tell "Error: ${reason}"
  Tell "Now exiting."

  exit 2
}

# Print a message to the user: a blank line first, then all arguments joined
# by spaces. Backslash escapes in the message are interpreted.
Tell() {
  local message="$*"
  echo -e "\n${message}"
}

# Print a message to the user without adding any newline; backslash escapes
# in the message are interpreted
Tellnr() {
  local message="$*"
  echo -en "${message}"
}

# Ask the user to confirm by typing YES in upper case.
# Returns normally on confirmation; any other answer ends the script through
# ScramWith. The caller's nocasematch setting is put back when the function
# returns.
Yesno() {
  local saved_setting answer

  # Remember the current nocasematch state, then force a case-sensitive compare
  saved_setting=$(shopt -p nocasematch)
  trap "$saved_setting" RETURN
  shopt -u nocasematch

  Tellnr "\nType upper-case YES to proceed:"
  read answer

  if [[ "$answer" != "YES" ]] ; then
    ScramWith "Mission aborted by user."
  fi
}

# Succeeds when sdparm reports "transport: serial attached scsi" (in any
# letter case) for the device.
#   $1 - device name, with or without a leading /dev/
IsSAS () {
        local saved_setting dev inquiry

        # Match case-insensitively here, restoring the caller's setting on return
        saved_setting=$(shopt -p nocasematch)
        trap "$saved_setting" RETURN
        shopt -s nocasematch

        dev=${1#'/dev/'}
        inquiry=$($SDPARM -ip di_target /dev/$dev 2>&1)

        [[ $inquiry =~ transport:\ serial\ attached\ scsi ]]
}

# Succeeds when sysfs flags the device as rotational (queue/rotational is 1),
# i.e. not solid state.
#   $1 - device name, with or without a leading /dev/
IsRotational () {
        local dev=${1#'/dev/'}
        local flag

        flag=$(cat /sys/block/$dev/queue/rotational)

        [ "$flag" == 1 ]
}

# Succeeds when the sense data sdparm returns for the device matches
# $SAS_SPUNDOWN_SENSE_RE (in any letter case), i.e. the drive is spun down
# (standby_z).
#   $1 - device name, with or without a leading /dev/
IsSBY () {
        local saved_setting dev sense

        # Match case-insensitively here, restoring the caller's setting on return
        saved_setting=$(shopt -p nocasematch)
        trap "$saved_setting" RETURN
        shopt -s nocasematch

        dev=${1#'/dev/'}
        sense=$($SDPARM --command=sense /dev/$dev 2>&1)

        [[ $sense =~ $SAS_SPUNDOWN_SENSE_RE ]]
}

# Print the controller's slot, cut out of the resolved sysfs path of the
# device.
#   $1 - device name, with or without a leading /dev/
GetCtrlSlot () {
  local dev=${1#'/dev/'}

  # The first expression removes everything up to and including the segment
  # prefix that follows the first component under the PCI root; the second
  # drops whatever comes after the next "/".
  readlink -m /sys/block/$dev |
        sed -e 's=.*devices/pci[0-9a-f:]*/[0-9a-f:.]\+/[0-9a-f]*:==' -e 's=/.*=='
}

# Succeeds when the array counts as started: either something matches
# /mnt/disk[1-9]*, or the mdstate line of $UNRAIDVAR/var.ini does not read
# ="STOPPED"
IsArrayStarted () {
  local mdstate_line

  if ls /mnt/disk[1-9]* &>/dev/null ; then
    return 0
  fi

  mdstate_line=$(grep -i '^mdstate' $UNRAIDVAR/var.ini)

  [[ ! $mdstate_line =~ .*=\"STOPPED\" ]]
}

# Install a "smartctl" plug, to stop Unraid's SMART polls from interfering with our testing.
# The real binary is renamed to ${SMARTCTL}.suspended.<pid> and a small script
# takes its place; the return status is that of the two renames.
InstallHook () {

  local hook_file=${SMARTCTL}.hook.$MYPID
  local parked_binary=${SMARTCTL}.suspended.$MYPID

  $DEBUG && Tell "About to install smartctl hook. My PID=$MYPID"

  # Generate the stand-in script next to the real binary. The heredoc is
  # unquoted, so variables are expanded now; only \$ survives as a literal $.
  cat > $hook_file << EOF
#!/bin/bash

# This is a temprary smartctl blocker. Should be in place only for the duration
# of the sas-util test. If it stayed behind, there's a problem... :-(

# Note - most bash variables are evaluated before file is written; only escaped $ is retained.

if kill -0 $MYPID 2>/dev/null ; then	# Is the calling process still there?

  echo "smartctl is temporarily unavailable... SAS testing is in progress. Please try again later"
  exit 1

else

  logger -t "$ME" "smartctl blocker inadvertently left behind"
  mv ${SMARTCTL}.suspended.$MYPID ${SMARTCTL} && {
    logger -t "$ME" "smartctl blocker sucessfully self-removed"
    ${SMARTCTL} "\$@"
    }

fi
EOF

  chmod 755 $hook_file

  # Park the real binary first; only then move the stand-in into its place
  mv ${SMARTCTL} $parked_binary &&
    mv $hook_file ${SMARTCTL}

}

# Undo InstallHook: when this run's parked smartctl binary exists, move it
# back to its original path. Does nothing otherwise.
Cleanup () {

  local parked_binary=${SMARTCTL}.suspended.$MYPID

  $DEBUG && Tell "Cleaning up"

  # Nothing was parked by this process - nothing to restore
  [ -f $parked_binary ] || return 0

  mv $parked_binary ${SMARTCTL}

}

# Run the tool, handing the script's command-line arguments to Main unchanged
Main "$@"
